MODEL COMPUTATION (reference and validation data pre-processing)

Library loading, function definition and gene annotation

library(limma)
library(DESeq2)
library(biomaRt)
library(org.Hs.eg.db)
library(readxl)
library(reshape2)
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(caret)
library(steFunctions)
library(EnsDb.Hsapiens.v79)
library(EnsDb.Hsapiens.v75)
library(heatmap3)
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(doMC)
library(pROC)

#define function to get color gradient 

#' Map numeric values to a color gradient.
#'
#' @param colValue Named numeric vector to colorize.
#' @param minRange,maxRange Optional bounds for the gradient; default to the
#'   data range padded by 0.001 so the extreme values fall inside the breaks.
#' @param ccolors Colors interpolated from low to high values.
#' @param bbreak Step size between consecutive color breaks.
#' @return Named character vector of colors, one per element of colValue.
getGradientColors <- function(colValue, minRange = NULL, maxRange = NULL, ccolors = c('green3', 'red'), bbreak = 0.01){
  library(heatmap3)  # provides colByValue()
  if(is.null(minRange)){
    minRange <- min(colValue, na.rm = TRUE) - 0.001
  }
  if(is.null(maxRange)){
    maxRange <- max(colValue, na.rm = TRUE) + 0.001
  }
  bre <- seq(minRange, maxRange, bbreak)
  # colByValue() also draws a color key on the active device as a side effect;
  # we only want the color assignments, so the device is closed below.
  colori <- colByValue(colValue, col = colorRampPalette(ccolors)(length(bre) - 1), breaks = bre, cex.axis = 0.8)
  colori <- as.vector(colori)
  graphics.off()  # discard the plot produced by colByValue()
  names(colori) <- names(colValue)
  return(colori)
}

# Project directories: saved intermediates, pipeline working dir, figure output.
MEDIAsave <- "/home/mclaudia/MEDIA Project/Codici_finali/"
wdd <- "/home/mclaudia/MEDIA Project/OAStratification-pipeline/"
MEDIAgraph <- "/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

# Load the unpaired bulk DE results (object OAvsnonOA).
load(paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"),verbose=TRUE) #for TPM duplicate fix
## Loading objects:
##   OAvsnonOA
# Consensus gene lists: sheet 1 = up-regulated, sheet 2 = down-regulated.
up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)

# Ensembl gene_id -> gene symbol maps. EnsDb v79 is used for the reference
# data; v75 for the validation data (queried with org = 'hg19' below).
ensdf <- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type="DataFrame")
ens2gene <- as.data.frame(ensdf[,c("gene_id","gene_name")])

ensdf19 <- GenomicFeatures::genes(EnsDb.Hsapiens.v75, return.type="DataFrame")
ens2gene19 <- as.data.frame(ensdf19[,c("gene_id","gene_name")])

We use as reference Dataset 3 (Unpaired)

The TPM matrix has been collected from the txi import file from Soul et al., 2017

# Reference dataset: tximport object (txi) with TPM abundances.
load(paste0(wdd,"data/txi.RData"))

# Patient metadata: drop column 3, de-duplicate by patient name, and derive
# OA status from the sample-name prefix ("SH0..." = OA).
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")

tmp <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))]  #collect TPM matrix
keep <- rowSums(tmp > 0) >= 35   #TPMs > 0 in at least 50% of the samples
tmp <-tmp[keep,]

# Attach gene symbols: cbind-ing the id column coerces the matrix to
# character (values are converted back to numeric later). NOTE(review):
# merge() is given only by.y; by.x falls back to the shared "gene_id"
# column, which works here but would be clearer as by = "gene_id".
# Column 72 is the gene_name column appended by the merge.
tmp <-cbind(gene_id=rownames(tmp), tmp) 
tmp<-merge(tmp,ens2gene,by.y="gene_id")
tmp <-cbind(tmp$gene_name, tmp[,-72])
colnames(tmp)[1] <-"gene_name"
rownames(tmp) <-tmp$gene_id

# Gene symbols mapped by more than one Ensembl id; their rows will be
# collapsed into a single averaged row per symbol.
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp[,1]) 

# Rows whose symbol is unique are kept unchanged.
tpm.mat_maintain <-tmp[!tmp$gene_name %in% gn_dup,] 

# Pre-allocate the container for the averaged rows (columns 1-2 of
# tpm.mat_maintain are gene_name/gene_id, hence the -c(1,2)).
tpm2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tpm.mat_maintain[,-c(1,2)])))
colnames(tpm2) <-colnames(tpm.mat_maintain)[-c(1,2)]
rownames(tpm2) <-paste0("ENS_",1:length(gn_dup))

# Collapse each duplicated symbol into one row of per-sample mean TPMs
# (rounded to 3 decimals). seq_along() is safe when gn_dup is empty,
# where the original 1:length(gn_dup) would have iterated over c(1, 0).
for(i in seq_along(gn_dup)){
  g <- gn_dup[i]
  y <- tmp[tmp[,1] %in% g, -c(1,2)]  # all rows carrying this symbol, samples only
  tpm2[i, 1:ncol(tpm2)] <- apply(y, 2, function(v) round(mean(as.numeric(v)), 3))
}

# Re-attach the symbol column, align the column order with the kept rows
# (dropping the gene_id column, index 2), then stack unique and averaged rows.
tpm2 <-cbind(gene_name=gn_dup,tpm2) 
tpm2 <-tpm2[,colnames(tpm.mat_maintain)[-2]]
tpm3 <-rbind(tpm.mat_maintain[,-2],as.matrix(tpm2))

rownames(tpm3) <-tpm3[,1]

# Values are still character after the earlier cbind(); convert back to a
# numeric genes x samples matrix.
tpm.mat <-t(apply(tpm3[,-1],1,as.numeric))
colnames(tpm.mat) <-colnames(tpm3)[-1]
dim(tpm.mat)
## [1] 17804    70
# log2 with a small offset to avoid log2(0).
train_tpm <-log2(tpm.mat+0.001)
boxplot(train_tpm)

Load Validation dataset from GSE114007 (see vignette 4)

# Validation cohort (GSE114007), prepared in vignette 4; loads `newdataset`.
load(paste0(MEDIAsave,"datasetValidation_full.RData"),verbose = TRUE)
## Loading objects:
##   newdataset

Download gene lengths for TPM transformation of the Validation dataset

tmp<-newdataset
genes <- rownames(tmp)


# Gene lengths (hg19) for the validation genes. EDASeq queries by Ensembl id,
# so the symbols are first mapped through the v75 (hg19) annotation.
geneAnn <-ens2gene19[ens2gene19$gene_name %in% genes,]
out <- as.data.frame(EDASeq::getGeneLengthAndGCContent(id = geneAnn$gene_id, org = 'hg19',mode = "org.db"))
out$gene_id <-rownames(out)
out <-merge(geneAnn,out,by="gene_id")
out <-out[complete.cases(out),]
out <-out[!duplicated(out$gene_name),]  
rownames(out) <-out$gene_name
# Column 3 holds the length value used below; keep it as a 1-column frame.
gene_length <-out[,3,drop=FALSE]
# Keep only genes with a known length, in the same row order as gene_length.
tmp <-tmp[rownames(gene_length),]
dim(tmp)
## [1] 20685    38
identical(rownames(tmp),rownames(gene_length)) #TRUE
## [1] TRUE
# Counts -> TPM: divide by gene length, then scale each sample to 1e6
# (verified by the colSums output below).
y <- apply(tmp,2,function(x) x/as.numeric(gene_length[,1]))
tpm.mat2 <- t((t(y) * 1e6)/colSums(y))
head(colSums(tpm.mat2))
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##            1e+06            1e+06            1e+06            1e+06 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##            1e+06            1e+06
keep <- rowSums(tpm.mat2 > 0) >= 19   #TPMs > 0 in at least 50% of the samples
tpm.mat2 <-tpm.mat2[keep,]


# log2 with a small offset, matching the reference transformation.
validation_tpm <-log2(tpm.mat2+0.001)

boxplot(validation_tpm)

rm(y)

Rescale validation TPM-log2 data based on reference

1. Intersect reference genes with the ones of the sample of interest (or validation dataset)

2. quantile normalize the reference dataset

ii=intersect(rownames(train_tpm),rownames(validation_tpm)) #select common genes,14565

train_tpm <-train_tpm[ii,]
validation_tpm <-validation_tpm[ii,]

dim(train_tpm)
## [1] 14565    70
dim(validation_tpm)
## [1] 14565    38
# Quantile-normalize the reference; the function works on samples-in-rows,
# hence the transposes. NOTE(review): quantileNormalization presumably comes
# from the steFunctions package loaded above — confirm.
train_tpm <-t(quantileNormalization(t(train_tpm)))

# Reference quantile values: after quantile normalization every sample shares
# the same set of sorted values, so the sorted profile of column 1 suffices.
list_ranks <- as.numeric(sort(train_tpm[,1],decreasing=TRUE))

head(list_ranks)
## [1] 15.90471 14.80488 14.42682 14.01457 13.75508 13.53887

3. rescale the sample of interest for the quantile interval (or validation dataset)

# Rank-based rescaling: every validation gene is re-assigned the reference
# quantile value (list_ranks) that corresponds to its within-sample rank.

# Per-sample gene lists sorted by decreasing expression. apply() names the
# result by column, so no separate pre-allocation/naming is needed — the
# original pre-allocated list_val and then immediately overwrote it.
list_val <- apply(validation_tpm, 2, function(x) as.list(sort(x, decreasing = TRUE)))

list_valRank <- vector("list", ncol(validation_tpm))
names(list_valRank) <- colnames(validation_tpm)

# Assign the reference quantile values to each sample's genes, matched by rank.
for(i in seq_along(list_val)){
  gg <- names(unlist(list_val[[i]]))
  list_valRank[[i]] <- list_ranks
  names(list_valRank[[i]]) <- gg
}

# Reassemble into a genes x samples matrix aligned to validation_tpm.
normdata2 <- matrix(NA, nrow = nrow(validation_tpm), ncol = ncol(validation_tpm))
rownames(normdata2) <- rownames(validation_tpm)
colnames(normdata2) <- colnames(validation_tpm)

for(i in seq_along(list_valRank)){
  tmp <- as.data.frame(list_valRank[[i]])
  rownames(tmp) <- names(list_valRank[[i]])
  normdata2[,i] <- tmp[rownames(normdata2),]
}
boxplot(normdata2)  # Validation re-scaled onto the quantile-normalized reference

ELASTIC NET model (select most informative features)

  1. Run the model 100 times (100 x 20-elements samples, 10 random OA and 10 healthy samples)
  2. Then, select the features that occurred in at least 50% of the runs
  3. Compute mean coefficient values
  4. Compute scores per patient (linear combination) with the selected features gene expression TPM values
# Keep full-feature copies for the later Ridge ("score T") model before
# subsetting to the consensus gene panel.
train_tpm_forRidge <- train_tpm
validation_tpm_forRidge <- normdata2

validation_tpm <-normdata2[rownames(normdata2) %in% c(up_genes$gene,dw_genes$gene),] #select consensus genes present
dim(validation_tpm)
## [1] 43 38
# Restrict the reference to the same 43 consensus genes.
train_tpm <-train_tpm[rownames(train_tpm) %in% rownames(validation_tpm),]  
dim(train_tpm)
## [1] 43 70

Set 100 OA sample runs: balance data for normal samples (bootstrap)

# Draw 100 bootstrap runs: each run samples 10 OA columns (indices 11-70)
# without replacement; the 10 normal columns (1-10) are always included later.
set.seed(7894)
runs <- vector("list", 100)
names(runs) <- paste0("run_", 1:100)
# The original lambda assigned to its ignored argument (`x <- sample(...)`);
# that assignment was a no-op, so it is dropped here (same RNG stream).
runs <- lapply(runs, function(x) sample(11:70, 10, replace = FALSE))

# Materialize the 100 training subsets: columns 1-10 (normals) plus the
# 10 OA columns drawn for each run.
subsets <-vector("list",100)
names(subsets) <-paste0("run_",1:100)

for(i in 1:length(subsets)) subsets[[i]] <-train_tpm[,c(1:10,as.numeric(runs[[i]]))]  #create subsets

Train

Tune the model

# caret control: leave-one-out CV with class probabilities, keeping the
# per-fold predictions.
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

# Labels for each 20-sample subset: 10 normal followed by 10 OA.
ClassBoot=as.factor(c(rep("N",10),rep("OA",10)))

# One coefficient set (per-run elastic-net fit) will be stored per slot.
list_tunes <-vector("list",100)
names(list_tunes) <-paste0("tunes_",1:100)

Train the model for 100 runs

nCores <- 150 #change number of threads if needed

registerDoMC(nCores)
# Elastic net (alpha = 0.5) on each bootstrap subset; store the coefficients
# at the optimal lambda chosen by LOOCV over the lambda grid.
for(i in 1:length(subsets)){
  train_dataset <-t(subsets[[i]])
  train <- train(train_dataset,
                           y = ClassBoot,
                           method = "glmnet", 
                           trControl = trainCtrl.rpcv,
                           metric = "Accuracy", allowParallel=TRUE,
                           tuneGrid = expand.grid(alpha = 0.5,  #for elastic net, median between 0 and 1
                                                  lambda = seq(0.001,1,by = 0.001)))
  list_tunes[[i]] <- coef(train$finalModel, train$finalModel$lambdaOpt)
}

Assess presence of each feature for each run

# Runs x genes matrix of elastic-net coefficients: row i holds the
# coefficients selected in run i (0 when a gene was not selected).
table_occurrence <- matrix(0, 100, nrow(train_tpm))  # nrow() instead of length(rownames())
colnames(table_occurrence) <- rownames(train_tpm)
rownames(table_occurrence) <- paste0("run_", 1:100)

for(i in seq_len(nrow(table_occurrence))){
  # Coefficients of run i, dropping the intercept (first entry).
  tmp <- list_tunes[[i]][,1][-1]
  table_occurrence[i, names(tmp)] <- tmp
}
# Keep the raw coefficients for averaging later ...
meanvalues <- table_occurrence
# ... and binarize table_occurrence to presence/absence per run.
table_occurrence[table_occurrence != 0] <- 1

# Presence/absence tile plot: which genes were selected in which runs.
tbO <-melt(table_occurrence)

ggplot(tbO, aes(y=Var2, x=Var1, fill=factor(value,levels = c("1","0"))))+
  geom_tile()+
  theme(axis.text.y = element_text(size=13))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=7))+
  xlab("Runs")+
  ylab("Genes")+
  scale_fill_manual(name="value",values=c("1"="red", "0"="blue"))

# Genes selected in at least 50 of the 100 runs, most frequent first.
(selected <-sort(colSums(table_occurrence)[colSums(table_occurrence) >= 50],decreasing = TRUE))
## TNFSF11   LOXL3    DNER  TSPAN2   THBS3    ASPN   HTRA1    DYSF 
##     100      98      98      94      94      85      68      62
## TNFSF11   LOXL3    DNER  TSPAN2   THBS3    ASPN   HTRA1    DYSF 
##     100      98      98      94      94      85      68      62

Select features with at least 50% occurrence and compute mean coefficient values across the runs

selected1 <-names(selected)

# Mean coefficient per selected gene across all 100 runs (zeros from runs
# that did not select the gene are included in the mean).
meanvalues <-meanvalues[,selected1]
(cf <-colMeans(meanvalues))
##    TNFSF11      LOXL3       DNER     TSPAN2      THBS3       ASPN      HTRA1 
## 0.14553148 0.08586524 0.17203360 0.04224252 0.10231979 0.03194829 0.02917054 
##       DYSF 
## 0.03668216
barplot(sort(cf,decreasing = TRUE), col = "dodgerblue", cex.names=1.2,font = 2, yaxt = "n") 
axis(side = 2) 

Compute score (sR) for Reference dataset , exploring also the distribution of this score across the patients

# Full reference cohort labels: 10 normal, 60 OA.
ClassTRfull=as.factor(c(rep("N",10),rep("OA",60)))
score_df <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)

train_sc <-t(train_tpm[names(cf),])

# sR = linear combination of the 8 selected genes' expression with the mean
# elastic-net coefficients cf.
sc <-rowSums(t(apply(train_sc,1,function(x) x*cf))) #compute score

# Align score_df rows to the score vector before assigning.
rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc

score_df_train <-score_df
rm(score_df)
colnames(score_df_train)<-c("Patient","Score","Class")

# Distribution of sR by class: density, then boxplot with a significance test.
ggplot(score_df_train, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

ggplot(score_df_train, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 3)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()+
  theme(axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        legend.title=element_text(size=16), 
        legend.text=element_text(size=14))+
  guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients? REFERENCE

# Score-gradient colors (col1) and class colors (col2) for the heatmap.
col1 <-getGradientColors(score_df_train$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_train$Score)+ 0.01, minRange = min(score_df_train$Score)-0.01)

score_df_train$col1 <-col1
score_df_train <-score_df_train[order(score_df_train$Patient),]

# Assumes name order puts the 10 normal samples first (plum), then 60 OA
# (cyan) — consistent with the head() output below.
score_df_train$col2 <-c(rep("plum",10),rep("cyan",60))

sum(up_genes$gene %in% rownames(train_tpm))
## [1] 38
sum(dw_genes$gene %in% rownames(train_tpm))
## [1] 5
# The 43 consensus genes present in the reference, up-regulated first.
sel <-c(up_genes$gene,dw_genes$gene)[c(up_genes$gene,dw_genes$gene) %in%  rownames(train_tpm)]  #43
train_tpm <-train_tpm[sel,] 

# Row annotation: red = up-regulated (38 genes), blue = down-regulated (5).
annGenes <-data.frame(genes=sel, col3=c(rep("red",38),rep("blue",5)))

# Score-legend color scale anchored at min / 1st quartile / 3rd quartile / max.
summ=summary(score_df_train$Score)
col_fun = colorRamp2(c(summ[1],summ[2],summ[5],summ[6]), c("#00BFFF","#197CDD","#0F4DB3","#000080"))

score_df_train <-score_df_train[order(score_df_train$Score,decreasing = FALSE),]  #order the patients for increasing Score
head(score_df_train)
##        Patient    Score Class    col1 col2
## MRI009  MRI009 1.340591     N #00BFFF plum
## MRI008  MRI008 1.443142     N #03B9FF plum
## MRI002  MRI002 1.489666     N #04B7FF plum
## MRI005  MRI005 1.532074     N #06B5FF plum
## MRI004  MRI004 1.672486     N #0AAEFF plum
## MRI006  MRI006 1.769805     N #0DAAFF plum

Heatmap: z-score computation of the data with patients sorted by increasing score

# Row-wise z-scores, clipped to [-3, 3] for display.
train_tpm_scal <-t(apply(train_tpm,1,scale))
rownames(train_tpm_scal) <-rownames(train_tpm)
colnames(train_tpm_scal) <-colnames(train_tpm)

train_tpm_scal[train_tpm_scal < -3] <- -3
train_tpm_scal[train_tpm_scal > 3] <- 3

# Heatmap with columns fixed to increasing-sR order (Colv=NA) and side
# annotations for score (col1), class (col2) and DE direction (col3).
heatmap3(train_tpm_scal[annGenes$genes,rownames(score_df_train)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(15,15),
         ColSideColors = as.matrix(score_df_train[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
       fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

Compute score (sR) for Validation dataset, exploring also the distribution of this score across the patients

# Validation cohort labels: 18 normal, 20 OA.
ClassV=factor(c(rep("N",18),rep("OA",20)))
score_df <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)

val_sc <-t(validation_tpm[names(cf),])
identical(names(cf), colnames(val_sc)) #TRUE
## [1] TRUE
# sR on the validation set, using the same coefficients as the reference.
sc <-rowSums(t(apply(val_sc,1,function(x) x*cf)))

# Align score_df rows to the score vector before assigning.
rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc
head(sc)
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##         2.665540         3.051013         2.073371         3.084911 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##         1.887718         2.122148
score_df_val <-score_df
rm(score_df)
colnames(score_df_val)<-c("Patient","Score","Class")

# Distribution of sR by class on the validation set.
ggplot(score_df_val, aes(x=Score,fill=Class)) +
  geom_density(alpha=.7)+
  xlab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  theme_bw()

ggplot(score_df_val, aes(y=Score,x=Class, fill=Class))+
  geom_boxplot(alpha=.6)+
  stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
  geom_jitter((aes(colour = Class))) +
  ylab("score R")+
  scale_fill_manual(values = c("plum","cyan"))+
  scale_color_manual(values = c("deeppink","turquoise"))+
  theme_bw()+
  theme(axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15),
        legend.title=element_text(size=16), 
        legend.text=element_text(size=14))+
  guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients? VALIDATION

# Heatmap annotations for the validation cohort (same scheme as the reference).
col1 <-getGradientColors(score_df_val$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_val$Score)+ 0.01, minRange = min(score_df_val$Score)-0.01)
score_df_val$col1 <-col1
score_df_val <-score_df_val[order(score_df_val$Patient),]

# Assumes name order groups the 18 normal samples first (plum), then 20 OA
# (cyan) — TODO confirm this holds for the mixed "Normal_Cart_*"/"normal_*"
# naming under the session's collation.
score_df_val$col2 <-c(rep("plum",18),rep("cyan",20))

# Restrict to the 43 consensus genes, in the same order as the reference.
validation_tpm <-validation_tpm[sel,] 
# Score-legend color scale anchored at min / 1st quartile / 3rd quartile / max.
summ2=summary(score_df_val$Score)
col_fun2 = colorRamp2(c(summ2[1],summ2[2],summ2[5],summ2[6]), c("#00BFFF","#1C88F2","#0D40AB","#000080"))

score_df_val <-score_df_val[order(score_df_val$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_val)
##                         Patient     Score Class    col1 col2
## normal_06             normal_06 0.6873741     N #00BFFF plum
## normal_02             normal_02 1.1900648     N #0BACFF plum
## normal_04             normal_04 1.2291431     N #0CABFF plum
## normal_10             normal_10 1.4077748     N #10A5FF plum
## Normal_Cart_5_5 Normal_Cart_5_5 1.8877178     N #1B93FF plum
## normal_07             normal_07 1.9662685     N #1D91FF plum

Heatmap: z-score computation of the data with patients sorted by increasing score

# Row-wise z-scores, clipped to [-3, 3] for display.
validation_tpm_scal <-t(apply(validation_tpm,1,scale))
rownames(validation_tpm_scal) <-rownames(validation_tpm)
colnames(validation_tpm_scal) <-colnames(validation_tpm)

validation_tpm_scal[validation_tpm_scal < -3] <- -3
validation_tpm_scal[validation_tpm_scal > 3] <- 3

# Heatmap with columns fixed to increasing-sR order (Colv=NA) and side
# annotations for score (col1), class (col2) and DE direction (col3).
heatmap3(validation_tpm_scal[annGenes$genes,rownames(score_df_val)],
         Colv=NA,
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(10,10),
         ColSideColors = as.matrix(score_df_val[,c(4,5)]),
         ColSideLabs = c("score R","Status"),
         RowSideColors = annGenes$col3,
         RowSideLabs = "Consensus",
         cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
       fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun2, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

Performance analysis: ROC curve on Validation dataset for sR score

# ROC for the sR score on the validation set. The ordering by class is kept
# only so the saved data frame groups normals before OA; the ROC now pairs
# each score with its own row's class label. (The original passed the
# external ClassV vector, which only stayed aligned because ordering by
# Class happens to group the 18 "N" rows first — fragile.)
score_df_val <- score_df_val[order(score_df_val$Class),]
rocobj <- roc(score_df_val$Class, score_df_val$Score)

auc <- round(rocobj$auc, 3)

ggroc(rocobj, size = 2, col="dodgerblue",legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score R (sR) on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

rocobj_val <- rocobj

RIDGE model (all the available features,43)

Train

Tune the model

# Rebuild the 43-gene matrices for the Ridge model from the full copies
# saved before the elastic-net subsetting.
validation_tpm <-validation_tpm_forRidge[rownames(validation_tpm_forRidge) %in% c(up_genes$gene,dw_genes$gene),]
train_tpm <-train_tpm_forRidge[rownames(train_tpm_forRidge) %in% rownames(validation_tpm),]  

# Same LOOCV control as for the elastic-net runs.
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
                               verboseIter = TRUE, returnData = FALSE,
                               savePredictions = TRUE,        
                               classProbs = TRUE)

Train the model and save the coefficients

nCores <- 100 #change number if needed
registerDoMC(nCores)

# Single ridge fit (alpha = 0) on the full reference cohort with all 43 genes.
train_dataset <-t(train_tpm)
train <- train(train_dataset,
                 y = ClassTRfull,
                 method = "glmnet", 
                 trControl = trainCtrl.rpcv,
                 metric = "Accuracy", allowParallel=TRUE,
                 tuneGrid = expand.grid(alpha = 0, #ridge
                                        lambda = seq(0.001,1,by = 0.001)))
# Ridge coefficients at the optimal lambda, dropping the intercept.
cf_all <-coef(train$finalModel, train$finalModel$lambdaOpt)[,1][-1]

Compute score (sT) for Reference dataset using coefficients from the Ridge model

score_df_all_tr <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)

train_sc_all <-t(train_tpm[names(cf_all),])

# sT = linear combination of all 43 genes with the ridge coefficients.
sc1 <-rowSums(t(apply(train_sc_all,1,function(x) x*cf_all))) 

# Align rows to the score vector before assigning.
rownames(score_df_all_tr) <-score_df_all_tr[,1]
score_df_all_tr <-score_df_all_tr[names(sc1),]
score_df_all_tr$score <-sc1
head(sc1)
##   MRI001   MRI002   MRI003   MRI004   MRI005   MRI006 
## 7.028785 2.452862 4.865943 3.555113 4.605602 3.928878

Compute score (sT) for Validation dataset using coefficients from the Ridge model

score_df_all_val <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)

val_sc_all <-t(validation_tpm[names(cf_all),])

# sT on the validation set, with the same ridge coefficients.
sc <-rowSums(t(apply(val_sc_all,1,function(x) x*cf_all))) 

# Align rows to the score vector before assigning.
rownames(score_df_all_val) <-score_df_all_val[,1]
score_df_all_val <-score_df_all_val[names(sc),]
score_df_all_val$score <-sc
head(sc)
## Normal_Cart_10_8  Normal_Cart_2_2  Normal_Cart_3_3  Normal_Cart_4_4 
##         5.669915         7.437420         3.869674         6.863030 
##  Normal_Cart_5_5  Normal_Cart_6_6 
##         5.256593         6.175890

Performance analysis: ROC curve on Validation dataset for sT score

score_df_all_val <-score_df_all_val[order(score_df_all_val$score, decreasing=FALSE),]

# ROC for sT: each row's class label is paired with its own score.
rocobj_val_all<- roc(score_df_all_val$class,score_df_all_val$score) 

auc <-round(rocobj_val_all$auc,3)

ggroc(rocobj_val_all, size = 2, col="dodgerblue", legacy.axes = TRUE) +
  theme_bw()+
  ggtitle(paste0('ROC Curve for score T (sT) on Validation dataset ', '(AUC = ', auc, ')'))+
  labs(x = "1 - Specificity",y = "Sensitivity")

Compare the ROC curves on validation dataset to assess the efficacy of the sR prediction and apply DeLong’s test

# Compare sR (8 genes) vs sT (43 genes) ROC curves; DeLong's test for the
# difference in AUC.
list_rocs <-list(rocobj_val,rocobj_val_all)
names(list_rocs) <-c("Score R","Score T")

auc <- lapply(list_rocs, function(x) round(x$auc,3))

auc[1]
## $`Score R`
## [1] 0.875
auc[2]
## $`Score T`
## [1] 0.922
(rocts <-roc.test(rocobj_val,rocobj_val_all))
## 
##  DeLong's test for two ROC curves
## 
## data:  rocobj_val and rocobj_val_all
## D = -0.66741, df = 69.259, p-value = 0.5067
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8750000   0.9222222
# NOTE(review): the annotated AUC values below are hard-coded; keep them in
# sync with the printed aucs above if the data change.
ggroc(list_rocs, size = 2,legacy.axes = TRUE) +
    theme_bw()+
    annotate("text", 0.75, 0.44,  label=expression("AUC 8 features - score s"["R"] : 0.875) ,size=7)+
    annotate("text", 0.75,0.4, label=expression("AUC 43 features - score s"["T"] : 0.922) ,size=7)+
    labs(x = "1 - Specificity",y = "Sensitivity",color='Models')+
    theme(legend.title=element_text(size=16),legend.text=element_text(size=14))+
       guides(Models = guide_legend(override.aes = list(size = 4)))

Save resulting scores for the two models and the two datasets

# Bundle the four per-patient score tables (sT/sR x reference/validation)
# and write them to disk.
list_scores <- setNames(
  list(score_df_all_tr, score_df_all_val, score_df_train, score_df_val),
  c("scoreT Training", "scoreT Valid", "scoreR Training", "scoreR Valid")
)
save(list_scores, file = paste0(MEDIAsave, "list_scores.RData"))